# Deliberately no `rm(list = ls())` here: it only removes the objects of the
# global environment, leaves attached packages and changed options untouched,
# and wipes the workspace of anyone who sources this script.
# For a truly clean state, restart the R session before running the script.
# Import libraries
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Import the dataframe PenguinsWithoutMissingValues from the csv file.
To do so, use the function read.csv() or read.csv2(), depending on your system’s locale (so that “.” and “,” are interpreted correctly as decimal mark and field separator).
# Import the data frame PenguinsWithoutMissingValues from the csv file.
# The location is kept in a single variable so it is easy to adapt on another
# machine, and a missing file is reported with a clear message up front.
penguins_csv <- "C:/Users/Dari-Laptop/Desktop/FH Karnten - Master - AppDs/StatisticsAppDSLaptop/PenguinsWithoutMissingValues.csv"
if (!file.exists(penguins_csv)) {
  stop("Cannot find the penguin data file: ", penguins_csv, call. = FALSE)
}
data <- read.csv(penguins_csv)
# 1) How many observations are there?
# Without grouping columns, count() returns a one-row table with the row total.
data %>% count()
There are 333 observations.
# 2) What are the names of the observed variables?
# The observed variables are the columns of the data frame.
colnames(data)
## [1] "IndividualID" "Species" "Island"
## [4] "CulmenLength.mm." "CulmenDepth.mm." "FlipperLength.mm."
## [7] "BodyMass.g." "Gender"
The names of observed variables are:
To do so, use the dplyr library with the functions group_by() and summarize().
# 3) How many penguins per Species are there?
# count() tallies the rows for every distinct value of Species.
count(data, Species)
There are:
To do so, use the dplyr library with the functions group_by() and summarize().
# 4) How many penguins per Species and Gender are there?
# One row per Species/Gender combination. `.groups = "drop"` returns an
# ungrouped table and silences the "grouped output" message that summarise()
# otherwise prints.
data %>%
  group_by(Species, Gender) %>%
  summarise(n = n(), .groups = "drop")
There are:
Create a table with the following information, per Species and Gender:
To do so, use the dplyr library with the functions group_by() and summarize().
# 5) BodyMass statistics per Species and Gender
#
# Create a table with the following information, per Species and Gender:
#
# - the number of observations,
# - minimum of BodyMass,
# - median of BodyMass,
# - average of BodyMass,
# - maximum of BodyMass.
#
# `.groups = "drop"` returns an ungrouped table and silences the
# "grouped output" message that summarise() otherwise prints.
data %>%
  group_by(Species, Gender) %>%
  summarise(
    Observations = n(),
    Minimum = min(BodyMass.g.),
    Median = median(BodyMass.g.),
    Average = mean(BodyMass.g.),
    Maximum = max(BodyMass.g.),
    .groups = "drop"
  )
This gives us the following results:
Create a table with the following information, per Island, Species and Gender:
To do so, use the dplyr library with the functions group_by() and summarize().
# 6) Quantitative statistics per Island, Species and Gender
#
# Create a table with the following information, per Island, Species and
# Gender:
#
# - the number of observations,
# - average of CulmenLength.mm.,
# - average of CulmenDepth.mm.,
# - average of FlipperLength.mm.,
# - average of BodyMass.g.
#
# `.groups = "drop"` returns an ungrouped table and silences the
# "grouped output" message that summarise() otherwise prints.
data %>%
  group_by(Island, Species, Gender) %>%
  summarise(
    Observations = n(),
    CulmenLengthAvg = mean(CulmenLength.mm.),
    CulmenDepthAvg = mean(CulmenDepth.mm.),
    FlipperLengthAvg = mean(FlipperLength.mm.),
    BodyMassAvg = mean(BodyMass.g.),
    .groups = "drop"
  )
This gives us the following results:
# 7) Provide summary statistics of the columns
# summary() describes every column of the data frame in one table.
data %>% summary()
## IndividualID Species Island CulmenLength.mm.
## Length:333 Length:333 Length:333 Min. :32.10
## Class :character Class :character Class :character 1st Qu.:39.50
## Mode :character Mode :character Mode :character Median :44.50
## Mean :43.99
## 3rd Qu.:48.60
## Max. :59.60
## CulmenDepth.mm. FlipperLength.mm. BodyMass.g. Gender
## Min. :13.10 Min. :172 Min. :2700 Length:333
## 1st Qu.:15.60 1st Qu.:190 1st Qu.:3550 Class :character
## Median :17.30 Median :197 Median :4050 Mode :character
## Mean :17.16 Mean :201 Mean :4207
## 3rd Qu.:18.70 3rd Qu.:213 3rd Qu.:4775
## Max. :21.50 Max. :231 Max. :6300
As we can see, summary statistics are computed only for the quantitative columns, giving us the minimum, the quartiles (including the median), the mean and the maximum.
## Pairs plot
# The four measurement columns are selected by name instead of by position
# (4:7), so the plot stays correct if the column order of the csv changes.
# Points and densities are coloured by Species.
ggpairs(
  data,
  mapping = ggplot2::aes(colour = Species),
  columns = c(
    "CulmenLength.mm.", "CulmenDepth.mm.",
    "FlipperLength.mm.", "BodyMass.g."
  )
)
As we can see, there is a good correlation between:
# Histogram of the culmen length over all penguins.
ggplot(data = data, mapping = aes(x = CulmenLength.mm.)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same histogram, bars filled by Species.
ggplot(data = data, mapping = aes(x = CulmenLength.mm., fill = Species)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# One panel (column) per Species.
ggplot(data = data, mapping = aes(x = CulmenLength.mm., fill = Species)) +
  geom_histogram() +
  facet_grid(cols = vars(Species))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Species in the rows, Gender in the columns of the panel grid.
ggplot(data = data, mapping = aes(x = CulmenLength.mm., fill = Species)) +
  geom_histogram() +
  facet_grid(rows = vars(Species), cols = vars(Gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density of the culmen length per Species, one panel per Gender.
# The filled curves of the species overlap, so they are drawn
# semi-transparent; with opaque fills the front curve hides the ones behind.
ggplot(data, aes(x = CulmenLength.mm., fill = Species)) +
  geom_density(alpha = 0.5) +
  facet_grid(. ~ Gender)
# Same density plot, one panel per Island.
ggplot(data, aes(x = CulmenLength.mm., fill = Species)) +
  geom_density(alpha = 0.5) +
  facet_grid(. ~ Island)
Create a boxplot of CulmenLength.mm. for the different Species, with one panel per Gender using facet_grid().
Can you spot any outliers?
# Boxplot of the culmen length per Species, one panel per Gender.
# Species is mapped to the y axis so every box gets a labelled position;
# without a y mapping the axis only shows meaningless numeric offsets.
ggplot(data, aes(x = CulmenLength.mm., y = Species, fill = Species)) +
  geom_boxplot() +
  facet_grid(. ~ Gender)
Yes, there are outliers for:
Use a scatterplot for CulmenLength.mm. and BodyMass.g.
Create
# Scatterplot of body mass against culmen length, coloured by Species,
# with one panel per Gender.
ggplot(
  data = data,
  mapping = aes(x = CulmenLength.mm., y = BodyMass.g., colour = Species)
) +
  geom_point() +
  facet_grid(cols = vars(Gender))
Create
Can you spot any outliers? What’s the IndividualID of the potential outlier?
# Interactive version of the scatterplot: the extra `text` aesthetic attaches
# the IndividualID to every point, and ggplotly() turns the static ggplot
# into a plotly widget.
p <- ggplot(
  data,
  aes(
    x = CulmenLength.mm.,
    y = BodyMass.g.,
    color = Species,
    text = paste("IndividualID :", IndividualID)
  )
) +
  geom_point() +
  facet_grid(. ~ Gender)
ggplotly(p)
Yes, there are some outliers in our scatter plots: